## To Hide or Show Code Blocks
from IPython.display import HTML
# Injects a small jQuery toggle into the rendered notebook so all code cells
# can be shown/hidden with one button; code is hidden on initial load.
hidecode=HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
# Displaying the HTML object is what activates the toggle in the notebook.
hidecode
import glob, os
import dill
# print("CSV Files present in the directory:\n", '\n'.join(glob.glob(CheckPoint_Dir + '*.{}'.format('CheckPt')))) #../input
def WorkspaceBasedCheckPt(CheckPtPosition = 0, AccessWorkSpaceImage = False, CheckPoint_Dir = "NotebookCheckpoints/"):
    """Gate notebook cells on numbered checkpoints and save/load workspace images.

    Checkpoints are files named '<n>__CheckPt.db' inside CheckPoint_Dir; the
    highest <n> present on disk is the last completed checkpoint.

    Parameters:
        CheckPtPosition (int): checkpoint number the calling cell belongs to.
        AccessWorkSpaceImage (bool): when True, dump the current workspace via
            dill if this is a new checkpoint, or restore it if the checkpoint
            already exists. Position 0 is the initial phase and is never saved.
        CheckPoint_Dir (str): directory (with trailing slash) holding the
            checkpoint files.

    Returns:
        bool: True when the calling cell should execute (its position is past
        the last saved checkpoint), False otherwise.
    """
    # Ensure the directory and the checkpoint-0 marker exist so the max()
    # below always has at least one file to inspect (fix: previously an empty
    # existing directory would crash, and the marker was re-truncated on
    # every call via an unclosed file handle).
    if not os.path.exists(CheckPoint_Dir):
        os.makedirs(CheckPoint_Dir)
    marker = os.path.join(CheckPoint_Dir, '0__CheckPt.db')
    if not os.path.exists(marker):
        with open(marker, "w+"):
            pass
    # Highest checkpoint number recorded on disk; basename is '<n>__CheckPt.db'
    # (fix: os.path.basename instead of a brittle split('/')).
    LastCheckPt = max(int(os.path.basename(CheckPoint).split('__')[0])
                      for CheckPoint in glob.glob(CheckPoint_Dir + '*{}'.format('CheckPt.db')))
    ## Handle saving/loading of the workspace image
    # (fix: logical 'and' instead of bitwise '&' on boolean conditions).
    if AccessWorkSpaceImage and (CheckPtPosition == 0):
        print("Initial Phase, hence not saving workspace.")
    elif AccessWorkSpaceImage and (LastCheckPt < CheckPtPosition):
        dill.dump_session(CheckPoint_Dir + str(CheckPtPosition) + '__CheckPt.db') # To Save a session
        print("Congrats, This is a new CheckPoint hence saved.")
    elif AccessWorkSpaceImage and (LastCheckPt >= CheckPtPosition):
        dill.load_session(CheckPoint_Dir + str(CheckPtPosition) + '__CheckPt.db') # To restore a session
        print("This CheckPoint exist hence loaded")
    ## Decide whether the calling cell should run; '>=' means the cell at the
    ## current checkpoint position is skipped once that checkpoint is saved.
    if LastCheckPt < CheckPtPosition:
        print('Running this cell')
        return True
    print('Most Recent Checkpoint is : {} \nHence, not running.'.format(LastCheckPt))
    return False
# https://stackoverflow.com/questions/26873127/show-dataframe-as-table-in-ipython-notebook/29665452
import sys
# Report the interpreter version for reproducibility of the notebook run.
print("System Version")
print(sys.version)
os.getcwd()
## Getting and setting the current directory as working directory
# os.chdir(os.getcwd())
# Checkpoint 0 is the initial phase: it always runs and never saves an image.
if WorkspaceBasedCheckPt(0):
WorkspaceBasedCheckPt(0, True)

Data can be downloaded from this link.
The data is present in a CSV file with the emoticons removed. The file contains 6 fields:

![]()
Reference-style:
![]()

if WorkspaceBasedCheckPt(1):
# Central configuration: input directory plus the raw and cleaned CSV names
# for the Sentiment140 train/test data.
config = {
'input_dir': 'inputs/',
'training_file': 'training.1600000.processed.noemoticon.csv',
'test_file':'testdata.manual.2009.06.14.csv',
'training_file_cleaned': 'CleanedTrainingData.csv',
'test_file_cleaned': 'CleanedTestData.csv'
# 'FileLocalSavingName':"DataClustering.csv",
# 'DimensionalityTransformationAlgo':['PCA', 'ICA'],
# 'bq_env': {'edit_query': 'Yes',
# 'bq_query_template_file': 'QueryTemplateClustering.txt',
# 'sid': ['1071'] ,
# 'date': ['010218'],
# 'MaxNoObsToGet': '1000000'},
# 'Trial':["ABC","DEF","GHI"],
}
# for i in dict.keys():
# for j in dict[i]:
# print(i, " ", j, " ",dict[i][0])
# from datetime import date, timedelta
# CurrentDate = date((2000 + int(TableDate[0][4:6])), int(TableDate[0][2:4]),int(TableDate[0][0:2]))
# format = "%d%m%y" #"%a %b %d %H:%M:%S %Y"
# TablesToCheck = [(CurrentDate + timedelta(days=i)).strftime(format) for i in range(int(TableWindow))]
# TablesToCheck
## Saving this checkpoint
# First call saves checkpoint 1's workspace image; the repeats only reload /
# report status (notebook re-run artifacts).
WorkspaceBasedCheckPt(1, True)
WorkspaceBasedCheckPt(1, True)
WorkspaceBasedCheckPt(1)
WorkspaceBasedCheckPt(2)
1. Importing the Libraries
## Moving towards checkpoint 2
if WorkspaceBasedCheckPt(2):
# Core analysis stack; pandas display options widened for wide frames.
import time
# start_time = time.time()
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None # default='warn'
pd.options.display.max_columns = 999
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
plt.style.use('fivethirtyeight')
# IPython magics: render figures inline at retina resolution.
%matplotlib inline
# %matplotlib notebook
%config InlineBackend.figure_format = 'retina'
# HeadingBreaker = "======================================================================================\n"
# SubHeadingBreaker = "--------------------------------------------------------------------------------------\n"
# NotebookProcessCatalog = "Process Followed Log\n" + HeadingBreaker
# end_time = time.time()
# print('Time taken {:.4f} s'.format(end_time - start_time))
2. Importing the Data
if WorkspaceBasedCheckPt(2):
from subprocess import check_output
# print("CSV Files present in the directory:", check_output(["ls", "*.csv"]).decode("utf8")) #../input
print("CSV Files present in the directory:\n", '\n'.join(glob.glob(config['input_dir'] + '*.{}'.format('csv')))) #../input
if WorkspaceBasedCheckPt(2):
# Sentiment140 CSVs ship without a header row, so column names are supplied here.
cols = ['sentiment_class','id','date','query_string','user','text']
train_DF = pd.read_csv(config['input_dir'] + config['training_file'], encoding="ISO-8859-1", header=None, names=cols)
test_DF = pd.read_csv(config['input_dir'] + config['test_file'], encoding="ISO-8859-1", header=None, names=cols)
print("Train Dataset Shape :", train_DF.shape)
print("Test Dataset Shape :", test_DF.shape)
display(train_DF.head())
3. Understanding the Data
if WorkspaceBasedCheckPt(2):
print("Observing the structure of Train dataset")
print("Train Dataset Shape :", train_DF.shape)
# One summary table per variable: dtype, missing count, unique count, and
# describe() statistics, joined on the variable name.
VarDataType_DF = pd.DataFrame(train_DF.dtypes).reset_index().rename(columns = {'index' : 'VariableName', 0 : 'DataType'})
Miss_DF = pd.DataFrame(train_DF.isnull().sum()).reset_index().rename(columns = {'index' : 'VariableName', 0 : '#OfMissingObs'})
Uniq_DF = pd.DataFrame({'VariableName': [col for col in train_DF.columns], '#OfUniqueValue': [len(train_DF[col].unique()) for col in train_DF.columns]}, columns = ['VariableName', '#OfUniqueValue'])
Prop_DF = train_DF.describe().T.reset_index().rename(columns = {'index': 'VariableName'})
display(VarDataType_DF.set_index('VariableName').join(Miss_DF.set_index('VariableName')).join(Uniq_DF.set_index('VariableName')).join(Prop_DF.set_index('VariableName')).fillna('-').reset_index())
if WorkspaceBasedCheckPt(2):
    # Same per-variable overview as the train-set cell above, on the TEST set:
    # dtype, missing count, unique count, and describe() statistics.
    print("Observing the structure of Test dataset")
    print("Test Dataset Shape :", test_DF.shape)
    VarDataType_DF = pd.DataFrame(test_DF.dtypes).reset_index().rename(columns = {'index' : 'VariableName', 0 : 'DataType'})
    Miss_DF = pd.DataFrame(test_DF.isnull().sum()).reset_index().rename(columns = {'index' : 'VariableName', 0 : '#OfMissingObs'})
    Uniq_DF = pd.DataFrame({'VariableName': [col for col in test_DF.columns], '#OfUniqueValue': [len(test_DF[col].unique()) for col in test_DF.columns]}, columns = ['VariableName', '#OfUniqueValue'])
    Prop_DF = test_DF.describe().T.reset_index().rename(columns = {'index': 'VariableName'})
    # BUG FIX: Uniq_DF was computed but never joined into the display (unlike
    # the train-set cell) — the unique-value column is now included.
    display(VarDataType_DF.set_index('VariableName').join(Miss_DF.set_index('VariableName')).join(Uniq_DF.set_index('VariableName')).join(Prop_DF.set_index('VariableName')).fillna('-').reset_index())
if WorkspaceBasedCheckPt(2):
# Frequency profile (top 10 values) for every object-typed column of the train set.
print("Printing top 10 obs/class in each object variable in Trainset")
for VarName in train_DF.columns:
if (train_DF[VarName].dtypes == object):
print("\n***********************************************************************")
print("\nAnalyzing the Variable:", VarName)
print("Variable datatype:", train_DF[VarName].dtypes)
# print("Missing Values:", train_DF[VarName].isnull().sum())
# print(TrainDF[VarName].describe())
display(pd.DataFrame(train_DF[VarName].value_counts()).reset_index().rename(columns = {'index' : 'Value', VarName : 'Frequency'}).head(10))
# if ((TrainDF[VarName].dtypes == float) | (TrainDF[VarName].dtypes == int)):
# print(pd.DataFrame(TrainDF[VarName].describe()).reset_index().rename(columns = {'index' : 'Property'}))
# print(TrainDF[VarName].value_counts()
if WorkspaceBasedCheckPt(2):
    # Frequency profile (top 10 values) for every object-typed column — TEST
    # set counterpart of the cell above.
    print("Printing top 10 obs/class in each object variable in Testset")  # label fixed: previously said "Trainset"
    for VarName in test_DF.columns:
        if (test_DF[VarName].dtypes == object):
            print("\n***********************************************************************")
            print("\nAnalyzing the Variable:", VarName)
            print("Variable datatype:", test_DF[VarName].dtypes)
            # print("Missing Values:", test_DF[VarName].isnull().sum())
            # print(test_DF[VarName].describe())
            display(pd.DataFrame(test_DF[VarName].value_counts()).reset_index().rename(columns = {'index' : 'Value', VarName : 'Frequency'}).head(10))
Our response variable is "sentiment_class", so let's observe the content along with its class.
if WorkspaceBasedCheckPt(2):
# Class balance of the response variable.
display(train_DF["sentiment_class"].value_counts().reset_index().rename(columns = {'index' : 'Sentiment_Class_Value', 'sentiment_class' : 'Frequency'}))
if WorkspaceBasedCheckPt(2):
# Widen pandas display so full tweet texts are visible.
# NOTE(review): 'display.height' was removed in later pandas versions — this
# line raises there; confirm the pinned pandas version supports it.
pd.set_option('display.height', 10000)
pd.set_option('display.max_colwidth', 180)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 500)
# Sample negative-class (0) tweets.
display(train_DF.loc[train_DF["sentiment_class"] == 0].head(10))
if WorkspaceBasedCheckPt(2):
# Sample positive-class (4) tweets.
display(train_DF.loc[train_DF["sentiment_class"] == 4].head(10))
4. Data Preprocessing
if WorkspaceBasedCheckPt(2):
# del()
# Drop columns not used for sentiment modelling, keeping only the label and text.
## To Trainset
train_DF.drop(['id','date','query_string','user'],axis=1,inplace=True)
## To Testset
test_DF.drop(['id','date','query_string','user'],axis=1,inplace=True)
if WorkspaceBasedCheckPt(2):
    # Report both frames' shapes after the column drops.
    print("Trainset Shape:", train_DF.shape)
    print("Testset Shape:", test_DF.shape)  # label fixed: test_DF.shape was printed as "Trainset Shape"
if WorkspaceBasedCheckPt(2):
# Remap labels to binary: 0 -> 0 (negative), 4 -> 1 (positive).
# NOTE(review): the test file also contains neutral class 2, which .map turns
# into NaN — confirm that is intended before using test_DF labels.
## To Trainset
train_DF['sentiment_class'] = train_DF['sentiment_class'].map({0: 0, 4: 1})
## To Testset
test_DF['sentiment_class'] = test_DF['sentiment_class'].map({0: 0, 4: 1})
display(train_DF["sentiment_class"].value_counts().reset_index().rename(columns = {'index' : 'Sentiment_Class_Value', 'sentiment_class' : 'Frequency'}))
if WorkspaceBasedCheckPt(2):
# Character length of each raw tweet, before cleaning.
## To Trainset
train_DF['pre_clean_len_text'] = [len(le) for le in train_DF.text]
## To Testset
test_DF['pre_clean_len_text'] = [len(le) for le in test_DF.text]
print("Updated Shape", train_DF.shape)
display(train_DF.head())
if WorkspaceBasedCheckPt(2):
# Boxplot + histogram of raw tweet lengths (lengths > 140 hint at HTML entities etc.).
f, (ax_box, ax_hist) = plt.subplots(figsize=(12, 6), nrows=2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
sns.boxplot(train_DF['pre_clean_len_text'], ax=ax_box)
sns.distplot(train_DF['pre_clean_len_text'], ax=ax_hist, bins = 350)
ax_box.set(xlabel='')
if WorkspaceBasedCheckPt(2):
# Exploratory one-off experiments on individual example tweets; each technique
# tried here is consolidated into TweetTextCleaner() further below.
display(train_DF.loc[train_DF['pre_clean_len_text'] > 140].head())
if WorkspaceBasedCheckPt(2):
display(train_DF['text'][343])
if WorkspaceBasedCheckPt(2):
# Strip @mentions.
import re
display(re.sub(r'@[A-Za-z0-9_]+','',train_DF['text'][343]))
# [word for word in train_DF['text'][343].split(' ') if word[0] != '@']
if WorkspaceBasedCheckPt(2):
display(train_DF['text'][343])
if WorkspaceBasedCheckPt(2):
# Decode HTML entities via BeautifulSoup.
from bs4 import BeautifulSoup
example1 = BeautifulSoup(train_DF['text'][343], 'lxml')
display(example1.get_text())
if WorkspaceBasedCheckPt(2):
display(train_DF['text'][226])
if WorkspaceBasedCheckPt(2):
display(train_DF['text'][226].encode("utf-8-sig"))#.decode('utf-8-sig') #.decode('utf_8')
if WorkspaceBasedCheckPt(2):
# Replace mojibake runs (the '�' replacement character).
# train_DF['text'][226].replace(u"�", "?")
display(re.sub('�[A-Za-z0-9]+ ', ' ?? ',train_DF['text'][226]))
if WorkspaceBasedCheckPt(2):
display(train_DF['text'][50])

if WorkspaceBasedCheckPt(2):
# ^ matches position just before the first character of the string
# $ matches position just after the last character of the string
# . matches a single character. Does not matter what character it is, except newline
# * matches preceding match zero or more times
# re.sub('https?://[A-Za-z0-9./]+ ', '',train_DF['text'][50])
display(re.sub('https?://[^ ]+', '',train_DF['text'][50]))
if WorkspaceBasedCheckPt(2):
display(re.sub('www.[^ ]+', '',train_DF['text'][50]))
if WorkspaceBasedCheckPt(2):
display(train_DF.text[175].lower())
if WorkspaceBasedCheckPt(2):
# Expand negated contractions via a word-boundary regex over the dict keys.
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
"haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
"wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
"can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
"mustn't":"must not", "i'm":"i am"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')
display(neg_pattern.sub(lambda x: negations_dic[x.group()], train_DF.text[175].lower()))
if WorkspaceBasedCheckPt(2):
display(train_DF.text[175])
if WorkspaceBasedCheckPt(2):
# Keep only letters.
display(re.sub('[^a-zA-Z]', ' ', train_DF.text[175]))
Tokenization, stemming/lemmatization, and stop words will be dealt with at a later stage, when creating the matrix with either a count vectorizer or a TF-IDF vectorizer.
if WorkspaceBasedCheckPt(2):
    import re
    from bs4 import BeautifulSoup
    from nltk.tokenize import WordPunctTokenizer
    tok = WordPunctTokenizer()
    # Patterns stripped from every tweet: @mentions, mojibake runs, http(s)
    # URLs, and bare www URLs.
    user_pat = r'@[A-Za-z0-9_]+'
    # combined_pat = r'|'.join((pat1, pat2))
    decode_pat = r'�[A-Za-z0-9]+ '
    http_pat = r'https?://[^ ]+'
    www_pat = r'www\.[^ ]+'  # BUG FIX: dot escaped — unescaped '.' matched ANY character after "www"
    negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                     "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                     "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                     "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                     "mustn't":"must not", "i'm":"i am", "you're": "you are", "we're": "we are",
                     "i've": "i have", "i'll": "i will", "ill": "i will", "cant":"can not", "dont":"do not"}
    neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')
    def TweetTextCleaner(text):
        """Clean one raw tweet: strip HTML, mentions, URLs and mojibake,
        lower-case, expand negations, keep only letters, and drop
        single-character tokens (except 'i' — see below)."""
        soup = BeautifulSoup(text, 'lxml')
        TextInClean = soup.get_text()
        TextInClean = re.sub(user_pat, ' ', TextInClean)
        TextInClean = re.sub(decode_pat, ' ', TextInClean)
        TextInClean = re.sub(http_pat, ' ', TextInClean)
        TextInClean = re.sub(www_pat, ' ', TextInClean)
        TextInClean = TextInClean.lower()
        TextInClean = neg_pattern.sub(lambda x: negations_dic[x.group()], TextInClean)
        ## adding some extra cases
        TextInClean = TextInClean.replace("&", "and")
        # Temporarily triple every 'i' so the standalone token "i" survives the
        # len > 1 filter below; collapsed back to a single 'i' at the end.
        TextInClean = TextInClean.replace("i", "iii")
        ## removing single characters also
        TextInClean = re.sub('[^a-zA-Z]', ' ', TextInClean)
        ## tokenizing and joining together to remove unnecessary white spaces
        TextInClean = [x for x in tok.tokenize(TextInClean) if len(x) > 1]
        ## Combining it back
        TextInClean = (' '.join(TextInClean))
        TextInClean = TextInClean.replace("iii", "i")
        return TextInClean
Testing on the cases mentioned above.
if WorkspaceBasedCheckPt(2):
# Spot-check the cleaner on the problem tweets examined earlier.
display(train_DF.text[[0, 50, 175, 226, 343]])
if WorkspaceBasedCheckPt(2):
display([TweetTextCleaner(text) for text in train_DF.text[[0, 50, 175, 226, 343]]])
Testing on some more cases.
if WorkspaceBasedCheckPt(2):
testing = train_DF.text[:50]
display([TweetTextCleaner(text) for text in testing])
%%time
if WorkspaceBasedCheckPt(2):
# Clean the whole train corpus in 1%-sized batches so progress (and per-batch
# timing) can be reported.
# NOTE(review): assumes train_DF.shape[0] is divisible by the 1% batch size —
# true for the 1.6M-row Sentiment140 file, but an uneven size would raise a
# KeyError at the tail; confirm if other inputs are expected.
print("Cleaning and parsing the tweets...\n")
Cleaned_Text = []
for i in range(0, train_DF.shape[0], int(train_DF.shape[0] * 0.01)):
start_time = time.time()
for obs in range(i, (i+int(train_DF.shape[0] * 0.01))):
Cleaned_Text.append(TweetTextCleaner(train_DF['text'][obs]))
end_time = time.time()
print("[{}% Completed, Time Taken {}sec] Tweets {} to {} of the total {} has been processed. ".format(int(i/int(train_DF.shape[0] * 0.01)) + 1, int(end_time - start_time), i, i+int(train_DF.shape[0] * 0.01), train_DF.shape[0]))
print("Checking Final Length of the Cleaned Text ", len(Cleaned_Text))
if WorkspaceBasedCheckPt(2):
# Attach the cleaned text and its character length to the train frame.
train_DF['clean_text'] = Cleaned_Text
train_DF['clean_len_text'] = [len(le) for le in train_DF.clean_text]
print("Updated Shape", train_DF.shape)
display(train_DF.head())
if WorkspaceBasedCheckPt(2):
# Side-by-side length distributions: raw (blue) vs cleaned (red) text.
f, (ax_box, ax_hist) = plt.subplots(figsize=(12, 6), nrows=2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
PCLB = sns.boxplot(train_DF['pre_clean_len_text'], ax=ax_box, color = 'blue')
PCLD = sns.distplot(train_DF['pre_clean_len_text'], ax=ax_hist, bins = 350, color = 'blue')
PCLB.set_title("Unclean Text Length")
PCLD.set(xlim=(0, 400))
ax_box.set(xlabel='')
f, (ax_box, ax_hist) = plt.subplots(figsize=(12, 6), nrows=2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
CLB = sns.boxplot(train_DF['clean_len_text'], ax=ax_box, color = 'red')
CLD = sns.distplot(train_DF['clean_len_text'], ax=ax_hist, bins = 150, color = 'red')
CLB.set_title("Clean Text Length")
CLD.set(xlim=(0, 400))
ax_box.set(xlabel='')
if WorkspaceBasedCheckPt(2):
# Cleaned texts still longer than 140 chars (contraction expansion lengthens text).
display(train_DF.loc[train_DF['clean_len_text'] > 140].head())
All of this is caused by the expansion of contracted words into their longer versions.
%%time
if WorkspaceBasedCheckPt(2):
# Clean the (much smaller) test corpus in one pass.
Cleaned_TestText = [TweetTextCleaner(obs) for obs in test_DF['text']]
print("Total Number of Observations processed", len(Cleaned_TestText))
if WorkspaceBasedCheckPt(2):
test_DF['clean_text'] = Cleaned_TestText
test_DF['clean_len_text'] = [len(le) for le in test_DF.clean_text]
print("Updated Shape", test_DF.shape)
if WorkspaceBasedCheckPt(2):
print("Dataset Shape:", train_DF.shape)
display(train_DF.head())
if WorkspaceBasedCheckPt(2):
# Persist cleaned frames as tab-separated files (tweets may contain commas).
train_DF.to_csv(config['input_dir'] + config['training_file_cleaned'], sep='\t', index=False)
test_DF.to_csv(config['input_dir'] + config['test_file_cleaned'], sep='\t', index=False)
if WorkspaceBasedCheckPt(2):
# Round-trip check: reload the saved train file and re-inspect its structure.
checkDF = pd.read_csv(config['input_dir'] + config['training_file_cleaned'], sep='\t')
print("Dataset Shape: ", checkDF.shape)
display(checkDF.head())
if WorkspaceBasedCheckPt(2):
print("Observing the structure of saved Train dataset")
print("Train Dataset Shape :", checkDF.shape)
VarDataType_DF = pd.DataFrame(checkDF.dtypes).reset_index().rename(columns = {'index' : 'VariableName', 0 : 'DataType'})
Miss_DF = pd.DataFrame(checkDF.isnull().sum()).reset_index().rename(columns = {'index' : 'VariableName', 0 : '#OfMissingObs'})
Uniq_DF = pd.DataFrame({'VariableName': [col for col in checkDF.columns], '#OfUniqueValue': [len(checkDF[col].unique()) for col in checkDF.columns]}, columns = ['VariableName', '#OfUniqueValue'])
Prop_DF = checkDF.describe().T.reset_index().rename(columns = {'index': 'VariableName'})
display(VarDataType_DF.set_index('VariableName').join(Miss_DF.set_index('VariableName')).join(Uniq_DF.set_index('VariableName')).join(Prop_DF.set_index('VariableName')).fillna('-').reset_index())
if WorkspaceBasedCheckPt(2):
# Rows whose clean_text became empty/NaN after cleaning.
display(checkDF.loc[checkDF['clean_text'].isnull()].head(15))
One thing to note here is that negative sentiment is often associated with tweets where a person has been tagged. We have also now observed why some of the cleaned texts are empty.
# Save checkpoint 2's workspace image (repeat calls only reload/report).
WorkspaceBasedCheckPt(2, True)
WorkspaceBasedCheckPt(2, True)
WorkspaceBasedCheckPt(2)
### Ways to view defined variables
# globals()
# locals()
# dir()
# who
if WorkspaceBasedCheckPt(3):
## Cleaning Workspace
# Free exploratory intermediates before the modelling phase; 'whos' (IPython
# magic) lists what remains.
del(CLB, CLD, PCLB, PCLD, Prop_DF, Uniq_DF, Miss_DF, VarDataType_DF, ax_hist, ax_box, VarName, TweetTextCleaner, Cleaned_Text, Cleaned_TestText, checkDF, decode_pat, example1, http_pat, neg_pattern, negations_dic, obs, testing, user_pat, www_pat)
whos
if WorkspaceBasedCheckPt(3):
# Pool all NEGATIVE-class cleaned tweets into one string for a word cloud.
string_list = train_DF['clean_text'].loc[train_DF['sentiment_class'] == 0].tolist()
SinglePooledString = " ".join(string_list)
print("Type:", type(SinglePooledString))
print("Length:", len(SinglePooledString))
if WorkspaceBasedCheckPt(3):
from wordcloud import WordCloud
wordcloud = WordCloud(width=1600, height=800,max_font_size=200, colormap='magma').generate(SinglePooledString)
plt.figure(figsize=(12,10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
if WorkspaceBasedCheckPt(3):
# 'love' appearing in the NEGATIVE cloud — inspect some examples.
[text for text in string_list if 'love' in text][0:15]
if WorkspaceBasedCheckPt(3):
# Same pooling for the POSITIVE class.
string_list = train_DF['clean_text'].loc[train_DF['sentiment_class'] == 1].tolist()
SinglePooledString = " ".join(string_list)
print("Type:", type(SinglePooledString))
print("Length:", len(SinglePooledString))
if WorkspaceBasedCheckPt(3):
wordcloud = WordCloud(width=1600, height=800,max_font_size=200).generate(SinglePooledString)
plt.figure(figsize=(12,10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
if WorkspaceBasedCheckPt(3):
## Cleaning the Workspace
del(string_list, SinglePooledString, wordcloud, WordCloud)
if WorkspaceBasedCheckPt(3):
# Fit a plain count vectorizer (no stop-word removal, unlimited vocabulary).
from sklearn.feature_extraction.text import CountVectorizer
CntVec = CountVectorizer()
CntVec.fit(train_DF['clean_text'])
if WorkspaceBasedCheckPt(3):
# Vocabulary size.
# NOTE(review): get_feature_names() was removed in newer scikit-learn in
# favour of get_feature_names_out() — confirm the pinned sklearn version.
len(CntVec.get_feature_names())
It looks like the count vectorizer has extracted 267,001 words from the corpus.
if WorkspaceBasedCheckPt(3):
# Per-class term frequencies: sum the document-term matrix over documents for
# each sentiment class, then assemble a token x {Negative, Positive} frame.
NegativeDoc_matrix = CntVec.transform(train_DF['clean_text'].loc[train_DF['sentiment_class'] == 0])
PositiveDoc_matrix = CntVec.transform(train_DF['clean_text'].loc[train_DF['sentiment_class'] == 1])
print(type(NegativeDoc_matrix), NegativeDoc_matrix.shape)
Negative_tf = np.sum(NegativeDoc_matrix,axis=0)
Positive_tf = np.sum(PositiveDoc_matrix,axis=0)
print(type(Negative_tf), Negative_tf.shape)
# np.squeeze turns the 1xV matrices into flat arrays.
Negative = np.squeeze(np.asarray(Negative_tf))
Positive = np.squeeze(np.asarray(Positive_tf))
print(type(Negative), Negative.shape)
TermFreq_DF = pd.DataFrame([Negative,Positive], columns=CntVec.get_feature_names()).transpose()
print(type(TermFreq_DF), TermFreq_DF.shape)
TermFreq_DF.rename(columns = {0:'Negative', 1:'Positive'}, inplace= True)
TermFreq_DF['TotalFreq'] = TermFreq_DF['Negative'] + TermFreq_DF['Positive']
print('DataFrame Shape:', TermFreq_DF.shape)
display(TermFreq_DF.sort_values(by='TotalFreq', ascending=False).head(15))
### Saving to CSV
Zipf's Law is first presented by French stenographer Jean-Baptiste Estoup and later named after the American linguist George Kingsley Zipf. Zipf's Law states that a small number of words are used all the time, while the vast majority are used very rarely. There is nothing surprising about this, we know that we use some of the words very frequently, such as "the", "of", etc, and we rarely use the words like "aardvark" (aardvark is an animal species native to Africa). However, what's interesting is that "given some corpus of natural language utterances, the frequency of any word is inversely proportional to its rank in the frequency table. Thus the most frequent word will occur approximately twice as often as the second most frequent word, three times as often as the third most frequent word, etc."
In other words, the rth most frequent word has a frequency f(r) that scales according to $${f(r)} \propto \frac{1}{r^\alpha}$$ for $$\alpha \approx {1}$$
Let's see how the tweet tokens and their frequencies look like on a plot.
if WorkspaceBasedCheckPt(3):
TermFreq_DF.sort_values(by='TotalFreq', ascending=False)['TotalFreq'][0]
max(TermFreq_DF['TotalFreq'])
if WorkspaceBasedCheckPt(3):
# Compare observed top-rank token frequencies against the Zipf curve
# f(r) = C / r^alpha for two alpha values, with C = the top frequency.
TopRanksToView = 200
exponent1 = 1
exponent2 = 0.65
plt.figure(figsize=(11,6))
WordRank = np.arange(start = 1, stop = TopRanksToView + 1, step=1)
ConstantOfProportionality = max(TermFreq_DF['TotalFreq'])
Expected_Zipf1 = [ConstantOfProportionality*(1/r**exponent1) for r in WordRank]
Expected_Zipf2 = [ConstantOfProportionality*(1/r**exponent2) for r in WordRank]
plt.plot(WordRank, Expected_Zipf1, color= 'r', linestyle= '--', linewidth = 2, alpha= 0.8, label = "exponent = 1")
plt.plot(WordRank, Expected_Zipf2, color= 'orange', linestyle= '--', linewidth = 2, alpha= 0.8, label = "exponent = 0.65")
TextFreq = TermFreq_DF.sort_values(by='TotalFreq', ascending=False)['TotalFreq'][:TopRanksToView]
plt.bar(WordRank, TextFreq, width = 1, align= 'center', alpha=0.8, label = "Actual")
plt.ylabel('Frequency')
plt.xlabel('Rank')
plt.title('Top {} tokens in tweets'.format(TopRanksToView))
plt.legend()
plt.grid(True)
plt.show()
On the X-axis is the rank of the frequency from highest rank from left up to 500th rank to the right. Y-axis is the frequency observed in the corpus (in this case, "Sentiment140" dataset). One thing to note is that the actual observations in most cases does not strictly follow Zipf's distribution, but rather follow the trend of "near-Zipfian" distribution.
Even though we can see the plot follows the trend of Zipf's Law, but it looks like it has more area above the expected Zipf curve in higher ranked words.
Another way to plot this is on a log-log graph, with X-axis being log(rank), Y-axis being log(frequency). By plotting on the log-log scale the result will yield roughly linear line on the graph.
if WorkspaceBasedCheckPt(3):
# Log-log Zipf plot: rank vs frequency should be roughly linear.
# NOTE(review): 'from pylab import *' shadows names (e.g. text, format) in the
# notebook namespace — kept as-is, but worth confirming nothing relies on the
# shadowed builtins afterwards.
from pylab import *
counts = TermFreq_DF.TotalFreq
tokens = TermFreq_DF.index
ranks = arange(1, len(counts)+1)
indices = argsort(-counts)
frequencies = counts[indices]
plt.figure(figsize=(8,6))
plt.ylim(1,10**6)
plt.xlim(1,10**6)
loglog(ranks, frequencies, marker=".")
plt.plot([1,frequencies[0]],[frequencies[0],1],color='r')
title("Zipf plot for tweets tokens")
xlabel("Frequency rank of token")
ylabel("Absolute frequency of token")
grid(True)
# Annotate ~25 tokens at log-spaced ranks along the curve.
for n in list(logspace(-0.5, log10(len(counts)-2), 25).astype(int)):
dummy = text(ranks[n], frequencies[n], " " + tokens[indices[n]],
verticalalignment="bottom",
horizontalalignment="left")
## Saving to csv
This time, removing stop words and limiting `max_features`.
if WorkspaceBasedCheckPt(3):
    # Re-vectorize, this time dropping English stop words and capping the
    # vocabulary at the 10,000 most frequent tokens.
    from sklearn.feature_extraction.text import CountVectorizer
    CntVec = CountVectorizer(stop_words='english',max_features=10000)
    CntVec.fit(train_DF['clean_text'])
if WorkspaceBasedCheckPt(3):
    len(CntVec.get_feature_names())
if WorkspaceBasedCheckPt(3):
    # Per-class term frequencies over the reduced vocabulary (mirrors the
    # full-vocabulary cell above, stored in TermFreq_DF2).
    NegativeDoc_matrix = CntVec.transform(train_DF['clean_text'].loc[train_DF['sentiment_class'] == 0])
    PositiveDoc_matrix = CntVec.transform(train_DF['clean_text'].loc[train_DF['sentiment_class'] == 1])
    print(type(NegativeDoc_matrix), NegativeDoc_matrix.shape)
    Negative_tf = np.sum(NegativeDoc_matrix,axis=0)
    Positive_tf = np.sum(PositiveDoc_matrix,axis=0)
    print(type(Negative_tf), Negative_tf.shape)
    Negative = np.squeeze(np.asarray(Negative_tf))
    Positive = np.squeeze(np.asarray(Positive_tf))
    print(type(Negative), Negative.shape)
    TermFreq_DF2 = pd.DataFrame([Negative,Positive], columns=CntVec.get_feature_names()).transpose()
    print(type(TermFreq_DF2), TermFreq_DF2.shape)  # BUG FIX: previously printed TermFreq_DF.shape (the old, full-vocabulary frame)
    TermFreq_DF2.rename(columns = {0:'Negative', 1:'Positive'}, inplace= True)
    TermFreq_DF2['TotalFreq'] = TermFreq_DF2['Negative'] + TermFreq_DF2['Positive']
    print('DataFrame Shape:', TermFreq_DF2.shape)
    TermFreq_DF2.sort_values(by='TotalFreq', ascending=False).head(15)
if WorkspaceBasedCheckPt(3):
# Top-50 tokens by frequency within NEGATIVE tweets.
TopRanksToView = 50
Rank = np.arange(TopRanksToView)
plt.figure(figsize=(14,7))
DataToUse = TermFreq_DF2.sort_values(by='Negative', ascending=False)['Negative'][:TopRanksToView]
plt.bar(Rank, DataToUse, align='center', alpha=0.8, color = 'red')
plt.xticks(Rank, DataToUse.index,rotation='vertical')
plt.ylabel('Frequency')
plt.xlabel('Top {} Negative tokens'.format(TopRanksToView))
plt.title('Top {} tokens in Negative tweets'.format(TopRanksToView))
plt.show()
if WorkspaceBasedCheckPt(3):
# Top-50 tokens by frequency within POSITIVE tweets.
TopRanksToView = 50
Rank = np.arange(TopRanksToView)
plt.figure(figsize=(14,7))
DataToUse = TermFreq_DF2.sort_values(by='Positive', ascending=False)['Positive'][:TopRanksToView]
plt.bar(Rank, DataToUse, align='center', alpha=0.8, color = 'green')
plt.xticks(Rank, DataToUse.index,rotation='vertical')
plt.ylabel('Frequency')
plt.xlabel('Top {} Positive tokens'.format(TopRanksToView))
plt.title('Top {} tokens in Positive tweets'.format(TopRanksToView))
plt.show()
if WorkspaceBasedCheckPt(3):
# Scatter of each token's negative vs positive frequency.
import seaborn as sns
plt.figure(figsize=(8,6))
ax = sns.regplot(x="Negative", y="Positive",fit_reg=False, scatter_kws={'alpha':0.5}, data= TermFreq_DF2)
plt.ylabel('Positive Frequency')
plt.xlabel('Negative Frequency')
plt.title('Negative Frequency vs Positive Frequency')
plt.grid(True)
plt.show()
if WorkspaceBasedCheckPt(3):
# Interactive version of the scatter above: red = more negative than positive,
# green = the reverse; hover shows the token.
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
output_notebook()
p = figure(x_axis_label='Negative', y_axis_label='Positive', plot_width=650, plot_height=400, x_range=(0, 70000), y_range=(0, 65000))
# p.y_range = Range1d(0, 60000)
# Diagonal y = x reference line.
linexy = np.arange(max(TermFreq_DF2.Positive)*2)
p.line(linexy, linexy, line_width=2, color = 'black')
p.circle(x = 'Negative',y = 'Positive', source= TermFreq_DF2.loc[TermFreq_DF2['Positive'] < TermFreq_DF2['Negative']], color = 'red', size=5, alpha=0.6)
p.circle(x = 'Negative',y = 'Positive', source= TermFreq_DF2.loc[TermFreq_DF2['Positive'] >= TermFreq_DF2['Negative']], color = 'green', size=5, alpha=0.6)
from bokeh.models import HoverTool
hover = HoverTool(tooltips=[('token','@index')])
p.add_tools(hover)
show(p)
Most of the words are below 10,000 on both X-axis and Y-axis, and we cannot see a meaningful relation between negative and positive frequency.
In order to come up with a meaningful metric which can characterise important tokens in each class, I borrowed a metric presented by Jason Kessler in PyData 2017 Seattle. In the talk, he presented a Python library called Scattertext. Even though I did not make use of the library, the metrics used in the Scattertext as a way of visualising text data are very useful in filtering meaningful tokens from the frequency data.
Intuitively, if a word appears more often in one class compared to another, this can be a good measure of how much the word is meaningful to characterise the class. In the below code I named it as 'pos_rate', and as you can see from the calculation of the code, this is defined as $${pos\_rate} = \frac{positive\ frequency} {positive\ frequency + negative\ frequency}$$
if WorkspaceBasedCheckPt(3):
# PositiveRate = Positive / (Positive + Negative) per token.
TermFreq_DF2['PositiveRate'] = TermFreq_DF2['Positive'] * 1./TermFreq_DF2['TotalFreq']
display(TermFreq_DF2.sort_values(by='PositiveRate', ascending=False).head(10))
Words with highest pos_rate have zero frequency in the negative tweets, but overall frequency of these words are too low to think of it as a guideline for positive tweets.
Another metric is the frequency a words occurs in the class. This is defined as $${pos\_freq\_pct} = \frac {positive\ frequency} {\Sigma positive\ frequency}$$
if WorkspaceBasedCheckPt(3):
# PositiveFreq_pct = token's share of all positive-class token occurrences.
TermFreq_DF2['PositiveFreq_pct'] = TermFreq_DF2['Positive'] * 1./TermFreq_DF2['Positive'].sum()
TermFreq_DF2.sort_values(by='PositiveFreq_pct', ascending=False).head(10)
But since pos_freq_pct is just the frequency scaled over the total sum of the frequency, the rank of pos_freq_pct is exactly same as just the positive frequency.
What we can do now is to combine pos_rate, pos_freq_pct together to come up with a metric which reflects both pos_rate and pos_freq_pct. Even though both of these can take a value ranging from 0 to 1, pos_rate has much wider range actually spanning from 0 to 1, while all the pos_freq_pct values are squashed within the range smaller than 0.015. If we average these two numbers, pos_rate will be too dominant, and will not reflect both metrics effectively.
So here we use harmonic mean instead of arithmetic mean. "Since the harmonic mean of a list of numbers tends strongly toward the least elements of the list, it tends (compared to the arithmetic mean) to mitigate the impact of large outliers and aggravate the impact of small ones." The harmonic mean H of the positive real number x1,x2,...xn is defined as $${H} = \frac {n}{\sum_{i=1}^{n}\ \frac{1}{x_i}}$$
if WorkspaceBasedCheckPt(3):
# Harmonic mean of PositiveRate and PositiveFreq_pct (0 when either is 0,
# since hmean is undefined for non-positive values).
from scipy.stats import hmean
TermFreq_DF2['Positive_hmean'] = TermFreq_DF2.apply(lambda x: (hmean([x['PositiveRate'], x['PositiveFreq_pct']])
if x['PositiveRate'] > 0 and x['PositiveFreq_pct'] > 0
else 0), axis=1)
TermFreq_DF2.sort_values(by='Positive_hmean', ascending=False).head(10)
The harmonic mean rank seems like the same as pos_freq_pct. By calculating the harmonic mean, the impact of small value (in this case, pos_freq_pct) is too aggravated and ended up dominating the mean value. This is again exactly same as just the frequency value rank and doesn't provide a meaningful result.
What we can try next is to get the CDF (Cumulative Distribution Function) value of both pos_rate and pos_freq_pct. CDF can be explained as "distribution function of X, evaluated at x, is the probability that X will take a value less than or equal to x". By calculating CDF value, we can see where the value of either pos_rate or pos_freq_pct lies in the distribution in terms of cumulative manner. In the below result of the code, we can see a word "welcome" with pos_rate_normcdf of 0.995625, and pos_freq_pct_normcdf of 0.999354. This means roughly 99.56% of the tokens will take a pos_rate value less than or equal to 0.91535, and 99.99% will take a pos_freq_pct value less than or equal to 0.001521.
Next, we calculate a harmonic mean of these two CDF values, as we did earlier. By calculating the harmonic mean, we can see that pos_normcdf_hmean metric provides a more meaningful measure of how important a word is within the class.
if WorkspaceBasedCheckPt(3):
    from scipy.stats import norm

    # CDF of a normal distribution fitted to the column's own mean/std;
    # maps both metrics onto a comparable 0-1 scale before averaging.
    def normcdf(x):
        return norm.cdf(x, x.mean(), x.std())

    TermFreq_DF2['PositiveRate_normcdf'] = normcdf(TermFreq_DF2['PositiveRate'])
    TermFreq_DF2['PositiveFreq_pct_normcdf'] = normcdf(TermFreq_DF2['PositiveFreq_pct'])
    # Harmonic mean of the two CDF columns (hmean imported in an earlier cell).
    TermFreq_DF2['Positive_normcdf_hmean'] = hmean([TermFreq_DF2['PositiveRate_normcdf'], TermFreq_DF2['PositiveFreq_pct_normcdf']])
    TermFreq_DF2.sort_values(by='Positive_normcdf_hmean', ascending=False).head(10)
if WorkspaceBasedCheckPt(3):
    # Mirror the positive-class metrics for the negative class.
    TermFreq_DF2['NegativeRate'] = TermFreq_DF2['Negative'] * 1. / TermFreq_DF2['TotalFreq']
    TermFreq_DF2['NegativeFreq_pct'] = TermFreq_DF2['Negative'] * 1. / TermFreq_DF2['Negative'].sum()

    def _neg_hmean(row):
        # hmean requires strictly positive inputs; zero otherwise.
        if row['NegativeRate'] > 0 and row['NegativeFreq_pct'] > 0:
            return hmean([row['NegativeRate'], row['NegativeFreq_pct']])
        return 0

    TermFreq_DF2['Negative_hmean'] = TermFreq_DF2.apply(_neg_hmean, axis=1)
    # CDF-normalise both negative metrics, then take their harmonic mean.
    TermFreq_DF2['NegativeRate_normcdf'] = normcdf(TermFreq_DF2['NegativeRate'])
    TermFreq_DF2['NegativeFreq_pct_normcdf'] = normcdf(TermFreq_DF2['NegativeFreq_pct'])
    TermFreq_DF2['Negative_normcdf_hmean'] = hmean([TermFreq_DF2['NegativeRate_normcdf'], TermFreq_DF2['NegativeFreq_pct_normcdf']])
    TermFreq_DF2.sort_values(by='Negative_normcdf_hmean', ascending=False).head(10)
if WorkspaceBasedCheckPt(3):
    # Scatter each token by its negative vs positive harmonic-mean score.
    plt.figure(figsize=(8, 6))
    ax = sns.regplot(x="Negative_hmean", y="Positive_hmean", fit_reg=False,
                     scatter_kws={'alpha': 0.5}, data=TermFreq_DF2)
    ax.set_xlabel('Negative Rate and Frequency Harmonic Mean')
    ax.set_ylabel('Positive Rate and Frequency Harmonic Mean')
    ax.set_title('Negative_hmean vs Positive_hmean')
if WorkspaceBasedCheckPt(3):
    from bokeh.plotting import figure
    from bokeh.io import output_notebook, show
    output_notebook()
    # Interactive scatter; axis ranges clipped to (0, 0.03) because the
    # harmonic means are squashed close to zero.
    p = figure(x_axis_label='Negative Rate and Frequency Harmonic Mean', y_axis_label='Positive Rate and Frequency Harmonic Mean', plot_width=650, plot_height=400, x_range=(0, 0.03), y_range=(0, 0.03))
    # y = x reference line: below it a token leans negative, above it positive.
    p.line([0,1], [0,1], line_width=2, color = 'black')
    # Red dots: tokens whose negative score exceeds their positive score.
    p.circle(x = 'Negative_hmean',y = 'Positive_hmean', source= TermFreq_DF2.loc[TermFreq_DF2['Positive_hmean'] < TermFreq_DF2['Negative_hmean']], color = 'red', size=5, alpha=0.6)
    # Green dots: tokens at least as positive as negative.
    p.circle(x = 'Negative_hmean',y = 'Positive_hmean', source= TermFreq_DF2.loc[TermFreq_DF2['Positive_hmean'] >= TermFreq_DF2['Negative_hmean']], color = 'green', size=5, alpha=0.6)
    from bokeh.models import HoverTool
    # Hovering shows which token (the DataFrame index) each dot represents.
    hover = HoverTool(tooltips=[('token','@index')])
    p.add_tools(hover)
    show(p)
# Superseded colour-mapped variant of the same plot, kept for reference:
# from bokeh.plotting import figure
# from bokeh.io import output_notebook, show
# from bokeh.models import LinearColorMapper
# output_notebook()
# color_mapper = LinearColorMapper(palette='Inferno256', low= min(TermFreq_DF2.Positive_hmean), high=max(TermFreq_DF2.Positive_hmean))
# p = figure(x_axis_label='Negative_hmean', y_axis_label='Positive_hmean')
# p.circle('Negative_hmean','Positive_hmean',size=5,alpha=0.3,source= TermFreq_DF2, color={'field': 'Positive_hmean', 'transform': color_mapper})
# from bokeh.models import HoverTool
# hover = HoverTool(tooltips=[('token','@index')])
# p.add_tools(hover)
# show(p)
if WorkspaceBasedCheckPt(3):
    # Same scatter as before, but on the CDF-based harmonic means.
    plt.figure(figsize=(8, 6))
    ax = sns.regplot(x="Negative_normcdf_hmean", y="Positive_normcdf_hmean",
                     fit_reg=False, scatter_kws={'alpha': 0.5}, data=TermFreq_DF2)
    ax.set_xlabel('Negative Rate and Frequency CDF Harmonic Mean')
    ax.set_ylabel('Positive Rate and Frequency CDF Harmonic Mean')
    ax.set_title('Negative_normcdf_hmean vs Positive_normcdf_hmean')
It seems like the harmonic mean of rate CDF and frequency CDF has created an interesting pattern on the plot. If a data point is near to upper left corner, it is more positive, and if it is closer to the bottom right corner, it is more negative.
It is good that the metric has created some meaningful insight out of frequency, but with text data, showing every token as just a dot is lacking important information on which token each data point represents. With 10,000 points, it is difficult to annotate all of the points on the plot. For this part, I have tried several methods and came to a conclusion that it is not very practical or feasible to directly annotate data points on the plot.
So I took an alternative method of the interactive plot with Bokeh. Bokeh is an interactive visualisation library for Python, which creates graphics in style of D3.js. Bokeh can output the result in HTML format or also within the Jupyter Notebook. And below is the plot created with Bokeh.
if WorkspaceBasedCheckPt(3):
    from bokeh.plotting import figure
    from bokeh.io import output_notebook, show
    from bokeh.models import LinearColorMapper
    output_notebook()
    # Colour each token by its positive score: Inferno256 runs from black
    # (most negative) through purple/orange to yellow (most positive).
    color_mapper = LinearColorMapper(palette='Inferno256', low= min(TermFreq_DF2.Positive_normcdf_hmean), high=max(TermFreq_DF2.Positive_normcdf_hmean))
    p = figure(x_axis_label='Negative_normcdf_hmean', y_axis_label='Positive_normcdf_hmean')
    # y = x reference line separating positive-leaning from negative-leaning tokens.
    p.line([0,1], [0,1], line_width=2, color = 'black')
    p.circle('Negative_normcdf_hmean','Positive_normcdf_hmean',size=5,alpha=0.6,source= TermFreq_DF2,color={'field': 'Positive_normcdf_hmean', 'transform': color_mapper})
    from bokeh.models import HoverTool
    # Hover tooltip shows the token (index) behind each data point.
    hover = HoverTool(tooltips=[('token','@index')])
    p.add_tools(hover)
    show(p)
With above Bokeh plot, you can see what token each data point represents by hovering over the points. For example, the points in the top left corner show tokens like "thank", "welcome", "congrats", etc. And some of the tokens in bottom right corner are "sad", "hurts", "died", "sore", etc. And the colour of each dot is organised in "Inferno256" colour map in Python, so yellow is the most positive, while black is the most negative, and the colour gradually goes from black to purple to orange to yellow, as it goes from negative to positive.
Depending on which model I will use later for classification of positive and negative tweets, this metric can also come in handy.
### Splitting the Data
Before we can train any model, we first consider how to split the data. Here I chose to split the data into three chunks: train, development, test. I referenced Andrew Ng's "deeplearning.ai" course on how to split the data.
Train set: The sample of data used for learning
Development set (Hold-out cross-validation set): The sample of data used to tune the parameters of a classifier, and provide an unbiased evaluation of a model.
Test set: The sample of data used only to assess the performance of a final model.
The ratio I decided to split my data is 98/1/1, 98% of data as the training set, and 1% for the dev set, and the final 1% for the test set. The rationale behind this ratio comes from the size of my whole data set. The dataset has more than 1.5 million entries. In this case, only 1% of the whole data gives me more than 15,000 entries. This is more than enough to evaluate the model and refine the parameters.
Another approach is splitting the data into only train and test set, and run k-fold cross-validation on the training set, so that you can have an unbiased evaluation of a model. But considering the size of the data, I have decided to use the train set only to train a model, and evaluate on the dev set, so that I can quickly test different algorithms and run this process iteratively.
# Split the corpus 98/1/1 into train / validation (dev) / test.
x = my_df.text
y = my_df.target
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
SEED = 2000
# First carve off 2% of the data, then split that 2% half-and-half
# into the validation and test sets (1% each overall).
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.02, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)
print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train), (len(x_train[y_train == 0]) / (len(x_train)*1.))*100, (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_validation), (len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100, (len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test), (len(x_test[y_test == 0]) / (len(x_test)*1.))*100, (len(x_test[y_test == 1]) / (len(x_test)*1.))*100))
When comparing various machine learning algorithms, baseline provides a point of reference to compare. The most popular baseline is the Zero Rule (ZeroR). ZeroR classifier simply predicts the majority category (class). Although there is no predictability power in ZeroR, it is useful for determining a baseline performance as a benchmark for other classification methods. As you can see from the above validation set class division, the majority class is negative with 50.40%, which means if a classifier predicts negative for every validation data, it will get 50.40% accuracy.
Another baseline I wanted to compare the validation results with is TextBlob. TextBlob is a Python library for processing textual data. Apart from other useful tools such as POS tagging and n-grams, the package has built-in sentiment classification. This is a so-called out-of-the-box sentiment analysis tool, and in addition to the null accuracy, I will also keep in mind the accuracy I get from TextBlob sentiment analysis to see how my model is performing.
from textblob import TextBlob
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
%%time
tbresult = [TextBlob(i).sentiment.polarity for i in x_validation]
tbpred = [0 if n < 0 else 1 for n in tbresult]
conmat = np.array(confusion_matrix(y_validation, tbpred, labels=[1,0]))
confusion = pd.DataFrame(conmat, index=['positive', 'negative'],
columns=['predicted_positive','predicted_negative'])
print "Accuracy Score: {0:.2f}%".format(accuracy_score(y_validation, tbpred)*100)
print "-"*80
print "Confusion Matrix\n"
print confusion
print "-"*80
print "Classification Report\n"
print classification_report(y_validation, tbpred)
If we want to use text in machine learning algorithms, we’ll have to convert them to a numerical representation. One of the methods is called bag-of-words approach. The bag of words model ignores grammar and order of words. Once we have a corpus (text data) then first, a list of vocabulary is created based on the entire corpus. Then each document or data entry is represented as numerical vectors based on the vocabulary built from the corpus.
With count vectorizer, we merely count the appearance of the words in each text. For example, let's say we have 3 documents in a corpus: "I love dogs", "I hate dogs and knitting", "Knitting is my hobby and my passion". If we build vocabulary from these three sentences and represent each document as count vectors, it will look like below pictures.
title
But if the size of the corpus gets big, the number of vocabulary gets too big to process. With my 1.5 million tweets, if I build vocabulary without limiting the number of vocabulary, I will have more than 260,000 vocabularies. This means that the shape of training data will be around 1,500,000 x 260,000, this sounds too big to train various different models with. So I decided to limit the number of vocabularies, but I also wanted to see how the performance varies depending on the number of vocabularies.
Another thing I wanted to explore is stopwords. Stop Words are words which do not contain important significance, such as "the", "of", etc. It is often assumed that removing stopwords is a necessary step, and will improve the model performance. But I wanted to see for myself if this is really the case. So I ran the same test with and without stop words and compared the result. In addition, I also defined my custom stopwords list, which contains top 10 most frequent words in the corpus: "to", "the", "my", "it", "and", "you", "not", "is", "in", "for".
A model I chose to evaluate different count vectors is the logistic regression. It is one of the linear models, so computationally scalable to big data, compared to models like KNN or random forest. And once I have the optimal number of features and make a decision on whether to remove stop words or not, then I will try different models with the chosen number of vocabularies' count vectors.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from time import time
Below I define two functions to iteratively train on a different number of features, then check the accuracy of logistic regression on the validation set.
def accuracy_summary(pipeline, x_train, y_train, x_test, y_test):
    """Fit *pipeline* on the train split, score it on the test split, and
    print the accuracy against the null (majority-class) accuracy.

    Returns (accuracy, train_test_time) where train_test_time is the
    combined fit + predict wall time in seconds.
    """
    # Null accuracy: what you get by always predicting the majority class.
    neg_rate = len(x_test[y_test == 0]) / (len(x_test) * 1.)
    null_accuracy = max(neg_rate, 1. - neg_rate)
    t0 = time()
    sentiment_fit = pipeline.fit(x_train, y_train)
    y_pred = sentiment_fit.predict(x_test)
    train_test_time = time() - t0
    accuracy = accuracy_score(y_test, y_pred)
    # Parenthesized print calls work under both Python 2 and Python 3.
    print("null accuracy: {0:.2f}%".format(null_accuracy*100))
    print("accuracy score: {0:.2f}%".format(accuracy*100))
    if accuracy > null_accuracy:
        print("model is {0:.2f}% more accurate than null accuracy".format((accuracy-null_accuracy)*100))
    elif accuracy == null_accuracy:
        print("model has the same accuracy with the null accuracy")
    else:
        print("model is {0:.2f}% less accurate than null accuracy".format((null_accuracy-accuracy)*100))
    print("train and test time: {0:.2f}s".format(train_test_time))
    print("-"*80)
    return accuracy, train_test_time
# Shared vectorizer/classifier instances reused across all sweeps below.
cvec = CountVectorizer()
lr = LogisticRegression()
# Vocabulary sizes to sweep: 10k, 20k, ..., 100k.
n_features = np.arange(10000,100001,10000)
def nfeature_accuracy_checker(vectorizer=cvec, n_features=n_features, stop_words=None, ngram_range=(1, 1), classifier=lr):
    """Sweep over the vocabulary sizes in `n_features`, fitting a
    vectorizer+classifier pipeline for each, and collect the results.

    Returns a list of (n_features, validation_accuracy, train_test_time)
    tuples. The defaults are module-level shared instances (cvec, lr),
    mutated in place via set_params, matching the notebook's design.
    """
    result = []
    print(classifier)
    print("\n")
    for n in n_features:
        vectorizer.set_params(stop_words=stop_words, max_features=n, ngram_range=ngram_range)
        checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', classifier)
        ])
        print("Validation result for {} features".format(n))
        nfeature_accuracy, tt_time = accuracy_summary(checker_pipeline, x_train, y_train, x_validation, y_validation)
        result.append((n, nfeature_accuracy, tt_time))
    return result
%%time
print "RESULT FOR UNIGRAM WITHOUT STOP WORDS\n"
feature_result_wosw = nfeature_accuracy_checker(stop_words='english')
%%time
print "RESULT FOR UNIGRAM WITH STOP WORDS\n"
feature_result_ug = nfeature_accuracy_checker(
# Load the pre-computed term-frequency table and inspect the top 10 tokens.
csv = 'term_freq_df.csv'
term_freq_df = pd.read_csv(csv, index_col=0)
term_freq_df.sort_values(by='total', ascending=False).iloc[:10]
from sklearn.feature_extraction import text
# Hoist the repeated sort/top-10 into one index expression.
top10_terms = term_freq_df.sort_values(by='total', ascending=False).iloc[:10].index
# Are our 10 most frequent corpus words already in sklearn's stop list?
a = frozenset(list(top10_terms))
b = text.ENGLISH_STOP_WORDS
set(a).issubset(set(b))
# Custom stop-word list: the corpus's own top-10 most frequent terms.
my_stop_words = frozenset(list(top10_terms))
%%time
print "RESULT FOR UNIGRAM WITHOUT CUSTOM STOP WORDS (Top 10 frequent words)\n"
feature_result_wocsw = nfeature_accuracy_checker(stop_words=my_stop_words)
# Wrap each sweep's (nfeatures, accuracy, time) tuples in a DataFrame.
nfeatures_plot_ug = pd.DataFrame(feature_result_ug,columns=['nfeatures','validation_accuracy','train_test_time'])
nfeatures_plot_ug_wocsw = pd.DataFrame(feature_result_wocsw,columns=['nfeatures','validation_accuracy','train_test_time'])
nfeatures_plot_ug_wosw = pd.DataFrame(feature_result_wosw,columns=['nfeatures','validation_accuracy','train_test_time'])
# Overlay the three stop-word strategies to compare validation accuracy.
plt.figure(figsize=(8,6))
plt.plot(nfeatures_plot_ug.nfeatures, nfeatures_plot_ug.validation_accuracy, label='with stop words')
plt.plot(nfeatures_plot_ug_wocsw.nfeatures, nfeatures_plot_ug_wocsw.validation_accuracy,label='without custom stop words')
plt.plot(nfeatures_plot_ug_wosw.nfeatures, nfeatures_plot_ug_wosw.validation_accuracy,label='without stop words')
plt.title("Without stop words VS With stop words (Unigram): Accuracy")
plt.xlabel("Number of features")
plt.ylabel("Validation set accuracy")
plt.legend()
By looking at the evaluation result, removing stop words did not improve the model performance, but keeping the stop words yielded better performance. I wouldn't say that removing stopwords fails to help the model performance every time, but as an empirical finding, in this particular setting, keeping the stop words improved the model performance.
According to Wikipedia, "n-gram is a contiguous sequence of n items from a given sequence of text or speech". In other words, n-grams are simply all combinations of adjacent words or letters of length n that you can find in your source text. Below picture represents well how n-grams are constructed out of source text.
In this project, I will extend the bag-of-words to trigrams, and see if it affects the performance.
%%time
print "RESULT FOR BIGRAM WITH STOP WORDS\n"
feature_result_bg = nfeature_accuracy_checker(ngram_range=(1, 2))
%%time
print "RESULT FOR TRIGRAM WITH STOP WORDS\n"
feature_result_tg = nfeature_accuracy_checker(ngram_range=(1, 3))
Now let's visualise the results we got from unigram, bigram, and trigram.
# Wrap the n-gram sweep results in DataFrames for plotting.
nfeatures_plot_tg = pd.DataFrame(feature_result_tg,columns=['nfeatures','validation_accuracy','train_test_time'])
nfeatures_plot_bg = pd.DataFrame(feature_result_bg,columns=['nfeatures','validation_accuracy','train_test_time'])
nfeatures_plot_ug = pd.DataFrame(feature_result_ug,columns=['nfeatures','validation_accuracy','train_test_time'])
# Compare unigram / bigram / trigram validation accuracy over vocabulary size.
plt.figure(figsize=(8,6))
plt.plot(nfeatures_plot_tg.nfeatures, nfeatures_plot_tg.validation_accuracy,label='trigram')
plt.plot(nfeatures_plot_bg.nfeatures, nfeatures_plot_bg.validation_accuracy,label='bigram')
plt.plot(nfeatures_plot_ug.nfeatures, nfeatures_plot_ug.validation_accuracy, label='unigram')
plt.title("N-gram(1~3) test result : Accuracy")
plt.xlabel("Number of features")
plt.ylabel("Validation set accuracy")
plt.legend()
The best validation set accuracy for each n-gram is as below.
unigram: 80,000 & 90,000 features at validation accuracy 80.28%
bigram: 70,000 features at validation accuracy 82.25%
trigram: 80,000 features at validation accuracy 82.44%
Below I defined another function to take a closer look at best performing number of features with each n-gram. Below function not only reports accuracy but also gives confusion matrix and classification report.
def train_test_and_evaluate(pipeline, x_train, y_train, x_test, y_test):
    """Fit *pipeline* on the train split and print a full evaluation on
    the test split: null accuracy, accuracy, confusion matrix, and
    classification report (precision/recall/F1 per class).
    """
    # Null accuracy: score of always predicting the majority class.
    neg_rate = len(x_test[y_test == 0]) / (len(x_test) * 1.)
    null_accuracy = max(neg_rate, 1. - neg_rate)
    sentiment_fit = pipeline.fit(x_train, y_train)
    y_pred = sentiment_fit.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    # labels=[0,1]: rows/columns ordered negative first, then positive.
    conmat = np.array(confusion_matrix(y_test, y_pred, labels=[0,1]))
    confusion = pd.DataFrame(conmat, index=['negative', 'positive'],
                             columns=['predicted_negative','predicted_positive'])
    # Parenthesized print calls work under both Python 2 and Python 3.
    print("null accuracy: {0:.2f}%".format(null_accuracy*100))
    print("accuracy score: {0:.2f}%".format(accuracy*100))
    if accuracy > null_accuracy:
        print("model is {0:.2f}% more accurate than null accuracy".format((accuracy-null_accuracy)*100))
    elif accuracy == null_accuracy:
        print("model has the same accuracy with the null accuracy")
    else:
        print("model is {0:.2f}% less accurate than null accuracy".format((null_accuracy-accuracy)*100))
    print("-"*80)
    print("Confusion Matrix\n")
    print(confusion)
    print("-"*80)
    print("Classification Report\n")
    print(classification_report(y_test, y_pred, target_names=['negative','positive']))
Before I run the defined function, let me briefly explain about confusion matrix and classification report. In order to evaluate the performance of a model, there are many different metrics that can be used. Below I will talk in case of binary classification, in which the target variable only has two classes to be predicted. In the case of this project, the classes are either "negative" or "positive".
One obvious measure of performance can be accuracy. It is the number of times the model predicted correctly for the class over the number of the whole data set. But in case of classification, this can be broken down further. Below is a representation of confusion matrix.
title
In the above matrix, each row represents the instances in an actual class while each column represents the instances in a predicted class, and it can be also presented swapping rows and columns (column for the actual class, row for predicted class). So the accuracy (ACC) I mentioned above can be expressed as below. $${ACC} = \frac {True Positive + True Negative}{Positive + Negative} = \frac {True Positive + True Negative}{True Positive + False Positive + True Negative + False Negative}$$
When the distribution of the classes in data is well balanced, accuracy can give you a good picture of how the model is performing. But when you have skewed data, for example, one of the class is dominant in your data set, then accuracy might not be enough to evaluate your model. Let's say you have a dataset which contains 80% positive class, and 20% negative class. This means that by predicting every data into the positive class, the model will get 80% accuracy. In this case, you might want to explore further into the confusion matrix and try different evaluation metrics.
There can be 9 different metrics, just from the combination of numbers from confusion matrix, but I will talk about two of them in particular, and another metric which combines these two.
"Precision" (also called Positive Predictive Value) tells you what proportion of data predicted as positive actually is positive. In other words, the proportion of True Positive in the set of all positive predicted data. $${PPV(Precision)} = \frac {True Positive}{True Positive + False Positive}$$
"Recall" (also called Sensitivity, Hit Rate, True Positive Rate) tells you what proportion of data that actually is positive were predicted positive. In other words, the proportion of True Positive in the set of all actual positive data. $${TPR(Recall)} = \frac {True Positive}{Positive} = \frac {True Positive}{True Positive + False Negative}$$
Below is the image of confusion matrix of cancer diagnose. If you think of "cancer" as positive class, "no cancer" as a negative class, the image explains well how to think of precision and recall in terms of the confusion matrix. title
And finally, the F1 score is the harmonic mean of precision and recall. The harmonic mean is a specific type of average, which is used when dealing with averages of units, like rates and ratios. So by calculating the harmonic mean of the two metrics, it will give you a good idea of how the model is performing both in terms of precision and recall. The formula is as below $${F1} = 2\cdot\frac {Precision\cdot Recall}{Precision + Recall}$$
%%time
# Closer look at the best unigram setting: 80k features.
ug_cvec = CountVectorizer(max_features=80000)
ug_pipeline = Pipeline([
    ('vectorizer', ug_cvec),
    ('classifier', lr)
])
train_test_and_evaluate(ug_pipeline, x_train, y_train, x_validation, y_validation)
%%time
# Best bigram setting: 70k features, unigrams + bigrams.
bg_cvec = CountVectorizer(max_features=70000,ngram_range=(1, 2))
bg_pipeline = Pipeline([
    ('vectorizer', bg_cvec),
    ('classifier', lr)
])
train_test_and_evaluate(bg_pipeline, x_train, y_train, x_validation, y_validation)
%%time
# Best trigram setting: 80k features, unigrams through trigrams.
tg_cvec = CountVectorizer(max_features=80000,ngram_range=(1, 3))
tg_pipeline = Pipeline([
    ('vectorizer', tg_cvec),
    ('classifier', lr)
])
train_test_and_evaluate(tg_pipeline, x_train, y_train, x_validation, y_validation)
From the above classification reports, we can see that model has slightly higher precision in negative class and higher recall in positive class. But this averages out by calculating the F1 score, and for both classes, we get the almost same F1 score for both positive and negative class. There is also a way to visualise the model performance by plotting ROC curve, but I will explain more in detail later.
TF-IDF is another way to convert textual data to a numeric form and is short for Term Frequency-Inverse Document Frequency. The vector value it yields is the product of these two terms; TF and IDF.
Let's first look at Term Frequency. We have already looked at term frequency above with count vectorizer, but this time, we need one more step to calculate the relative frequency. Let's say we have two documents in total as below.
I love dogs
I hate dogs and knitting
Relative term frequency is calculated for each term within each document as below.
$${TF(t,d)} = \frac {number\ of\ times\ term(t)\ appears\ in\ document(d)}{total\ number\ of\ terms\ in\ document(d)}$$
For example, if we calculate relative term frequency for 'I' in both document 1 and document 2, it will be as below.
$${TF('I',d1)} = \frac {1}{3} \approx {0.33}$$$${TF('I',d2)} = \frac {1}{5} = {0.2}$$
Next, we need to get Inverse Document Frequency, which measures how important a word is to differentiate each document by following the calculation as below. $${IDF(t,D)} = \log \Big(\frac {total\ number\ of\ documents(D)}{number\ of\ documents\ with\ the\ term(t)\ in\ it}\Big)$$
If we calculate inverse document frequency for 'I', $${IDF('I',D)} = \log \Big(\frac {2}{2}\Big) = {0}$$
Once we have the values for TF and IDF, now we can calculate TFIDF as below. $${TFIDF(t,d,D)} = {TF(t,d)}\cdot{IDF(t,D)}$$
Following the case of our example, TFIDF for term 'I' in both documents will be as below. $${TFIDF('I',d1,D)} = {TF('I',d1)}\cdot{IDF('I',D)} = {0.33}\times{0} = {0}$$$${TFIDF('I',d2,D)} = {TF('I',d2)}\cdot{IDF('I',D)} = {0.2}\times{0} = {0}$$
As you can see, the term 'I' appeared equally in both documents, and the TFIDF score is 0, which means the term is not really informative in differentiating documents. The rest is same as count vectorizer, TFIDF vectorizer will calculate these scores for terms in documents, and convert textual data into a numeric form.
from sklearn.feature_extraction.text import TfidfVectorizer
tvec = TfidfVectorizer()
%%time
print "RESULT FOR UNIGRAM WITH STOP WORDS (Tfidf)\n"
feature_result_ugt = nfeature_accuracy_checker(vectorizer=tvec)
%%time
print "RESULT FOR BIGRAM WITH STOP WORDS (Tfidf)\n"
feature_result_bgt = nfeature_accuracy_checker(vectorizer=tvec,ngram_range=(1, 2))
%%time
print "RESULT FOR TRIGRAM WITH STOP WORDS (Tfidf)\n"
feature_result_tgt = nfeature_accuracy_checker(vectorizer=tvec,ngram_range=(1, 3))
It seems like TFIDF vectorizer is yielding better results when fed to logistic regression. Let's plot the results from count vectorizer together with TFIDF vectorizer.
# DataFrames for the TFIDF sweeps (suffix 't' = tfidf).
nfeatures_plot_tgt = pd.DataFrame(feature_result_tgt,columns=['nfeatures','validation_accuracy','train_test_time'])
nfeatures_plot_bgt = pd.DataFrame(feature_result_bgt,columns=['nfeatures','validation_accuracy','train_test_time'])
nfeatures_plot_ugt = pd.DataFrame(feature_result_ugt,columns=['nfeatures','validation_accuracy','train_test_time'])
# Solid lines: tfidf; dotted lines: count vectorizer; one colour per n-gram.
plt.figure(figsize=(8,6))
plt.plot(nfeatures_plot_tgt.nfeatures, nfeatures_plot_tgt.validation_accuracy,label='trigram tfidf vectorizer',color='royalblue')
plt.plot(nfeatures_plot_tg.nfeatures, nfeatures_plot_tg.validation_accuracy,label='trigram count vectorizer',linestyle=':', color='royalblue')
plt.plot(nfeatures_plot_bgt.nfeatures, nfeatures_plot_bgt.validation_accuracy,label='bigram tfidf vectorizer',color='orangered')
plt.plot(nfeatures_plot_bg.nfeatures, nfeatures_plot_bg.validation_accuracy,label='bigram count vectorizer',linestyle=':',color='orangered')
plt.plot(nfeatures_plot_ugt.nfeatures, nfeatures_plot_ugt.validation_accuracy, label='unigram tfidf vectorizer',color='gold')
plt.plot(nfeatures_plot_ug.nfeatures, nfeatures_plot_ug.validation_accuracy, label='unigram count vectorizer',linestyle=':',color='gold')
plt.title("N-gram(1~3) test result : Accuracy")
plt.xlabel("Number of features")
plt.ylabel("Validation set accuracy")
plt.legend()
From above chart, we can see including bigram and trigram boost the model performance both in count vectorizer and TFIDF vectorizer. And for every case of unigram to trigram, TFIDF yields better results than count vectorizer.
The best result I can get with logistic regression was by using TFIDF vectorizer of 100,000 features including up to trigram. With this I will first fit various different models and compare their validation results, then I will build an ensemble (voting) classifier with top 5 models.
I haven't included some of computationally expensive models, such as KNN, random forest, considering the size of data and the scalability of models. And the fine-tuning of models will come after I try some other different vectorisation of textual data.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import NearestCentroid
from sklearn.feature_selection import SelectFromModel

# Candidate models to compare, paired name-to-estimator below.
names = ["Logistic Regression", "Linear SVC", "LinearSVC with L1-based feature selection","Multinomial NB",
         "Bernoulli NB", "Ridge Classifier", "AdaBoost", "Perceptron","Passive-Aggresive", "Nearest Centroid"]
classifiers = [
    LogisticRegression(),
    LinearSVC(),
    # L1 penalty induces sparse weights; SelectFromModel keeps only the
    # features with non-zero importance before the final L2 LinearSVC.
    Pipeline([
        ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
        ('classification', LinearSVC(penalty="l2"))]),
    MultinomialNB(),
    BernoulliNB(),
    RidgeClassifier(),
    AdaBoostClassifier(),
    Perceptron(),
    PassiveAggressiveClassifier(),
    NearestCentroid()
]
# Materialize as a list: under Python 3, zip() returns a one-shot iterator
# that would be exhausted after a single comparator run.
zipped_clf = list(zip(names, classifiers))
tvec = TfidfVectorizer()
def classifier_comparator(vectorizer=tvec, n_features=10000, stop_words=None, ngram_range=(1, 1), classifier=zipped_clf):
    """Fit each (name, estimator) pair on the same vectorizer setting and
    collect (name, validation_accuracy, train_test_time) tuples.
    """
    result = []
    vectorizer.set_params(stop_words=stop_words, max_features=n_features, ngram_range=ngram_range)
    for n, c in classifier:
        checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', c)
        ])
        # Parenthesized print works under both Python 2 and Python 3.
        print("Validation result for {}".format(n))
        print(c)
        clf_accuracy, tt_time = accuracy_summary(checker_pipeline, x_train, y_train, x_validation, y_validation)
        result.append((n, clf_accuracy, tt_time))
    return result
%%time
# Compare all candidate classifiers on 100k tfidf features, up to trigrams.
trigram_result = classifier_comparator(n_features=100000,ngram_range=(1,3))
from sklearn.ensemble import VotingClassifier
# Hard (majority-vote) ensemble of the five strongest models from the comparison.
clf1 = LogisticRegression()
clf2 = LinearSVC()
clf3 = MultinomialNB()
clf4 = RidgeClassifier()
clf5 = PassiveAggressiveClassifier()
eclf = VotingClassifier(estimators=[('lr', clf1), ('svc', clf2), ('mnb', clf3), ('rcs', clf4), ('pac', clf5)], voting='hard')
# Evaluate each individual model plus the ensemble on identical features.
for clf, label in zip([clf1, clf2, clf3, clf4, clf5, eclf], ['Logistic Regression', 'Linear SVC', 'Multinomial NB', 'Ridge Classifier', 'Passive Aggresive Classifier', 'Ensemble']):
    checker_pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(max_features=100000,ngram_range=(1, 3))),
        ('classifier', clf)
    ])
    # Parenthesized print works under both Python 2 and Python 3.
    print("Validation result for {}".format(label))
    print(clf)
    clf_accuracy, tt_time = accuracy_summary(checker_pipeline, x_train, y_train, x_validation, y_validation)
It seems like the voting classifier does no better than the simple logistic regression model. Thus, in a later part, I will try to fine-tune the logistic regression model. But before that, I would like to try another method of sentiment classification.
What I have demonstrated above are machine learning approaches to text classification problem, which tries to solve the problem by training classifiers on the labelled data set. Another famous approach to sentiment analysis task is a lexical approach. "In the lexical approach the definition of sentiment is based on the analysis of individual words and/or phrases; emotional dictionaries are often used: emotional lexical items from the dictionary are searched in the text, their sentiment weights are calculated, and some aggregated weight function is applied." http://www.dialog-21.ru/media/1226/blinovpd.pdf
In the previous part, I have calculated harmonic mean of "positive rate CDF" and "positive frequency percent CDF", and these have given me a good representation of positive and negative terms in the corpora. If it successfully filters which terms are important to each class, then this can also be used for prediction in a lexical manner.
So I decided to make a simple predictor, which make use of the harmonic mean value I calculated. Below I go through the term frequency calculation, and the steps to get 'pos_normcdf_hmean', but this time I calculated term frequency only from the train set. (* Since I learned that I don't need to transform sparse matrix to dense matrix for term frequency calculation, I computed the frequency directly from sparse matrix)
# Learn a 10,000-term vocabulary from the training texts only, then count
# term occurrences separately for each sentiment class.
from sklearn.feature_extraction.text import CountVectorizer
cvec = CountVectorizer(max_features=10000)
cvec.fit(x_train)
# Split the training texts by label before vectorizing each subset.
pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]
pos_doc_matrix = cvec.transform(pos_train)
neg_doc_matrix = cvec.transform(neg_train)
%%time
# Per-term totals: sum each sparse doc-term matrix down its rows (axis=0),
# yielding a 1 x vocab matrix of counts per class — no dense conversion needed.
neg_tf = np.sum(neg_doc_matrix,axis=0)
pos_tf = np.sum(pos_doc_matrix,axis=0)
from scipy.stats import hmean
from scipy.stats import norm
def normcdf(x):
    """Rescale x to (0, 1) via the CDF of a normal fitted to x's own mean and std."""
    loc, scale = x.mean(), x.std()
    return norm.cdf(x, loc, scale)
# Flatten the 1 x vocab sparse row sums into plain per-term count vectors.
neg = np.squeeze(np.asarray(neg_tf))
pos = np.squeeze(np.asarray(pos_tf))
# One row per vocabulary term, raw counts per class.
term_freq_df2 = pd.DataFrame([neg, pos], columns=cvec.get_feature_names()).T
term_freq_df2.columns = ['negative', 'positive']
term_freq_df2['total'] = term_freq_df2['negative'] + term_freq_df2['positive']
# `* 1.` forces float division (matters under Python 2 semantics).
term_freq_df2['pos_rate'] = term_freq_df2['positive'] * 1./term_freq_df2['total']
term_freq_df2['pos_freq_pct'] = term_freq_df2['positive'] * 1./term_freq_df2['positive'].sum()
# CDF-rescale both signals, then combine with a harmonic mean so only terms
# strong on BOTH rate and frequency score highly.
term_freq_df2['pos_rate_normcdf'] = normcdf(term_freq_df2['pos_rate'])
term_freq_df2['pos_freq_pct_normcdf'] = normcdf(term_freq_df2['pos_freq_pct'])
term_freq_df2['pos_normcdf_hmean'] = hmean([term_freq_df2['pos_rate_normcdf'], term_freq_df2['pos_freq_pct_normcdf']])
# Top 10 most "positive" terms by the combined score (notebook display).
term_freq_df2.sort_values(by='pos_normcdf_hmean', ascending=False).iloc[:10]
The calculation of the positivity score I decided on is fairly simple and straightforward. For each word in a document, look it up in the list of 10,000 words I built the vocabulary with, and get the corresponding 'pos_normcdf_hmean' value, then for the document calculate the average 'pos_normcdf_hmean' value. If none of the words can be found among the built 10,000 terms, then it yields a random probability ranging between 0 and 1. The single value I get for a document is handled as the probability of the document being in the positive class.
Normally, a lexical approach will take many other aspects into the calculation to refine the prediction result, but I will try a very simple model.
# Score each validation document as the mean 'pos_normcdf_hmean' of its known
# words, then threshold the score into a binary prediction.
# Fix: removed the stray debug expression `pos_hmean['wtf']` — its value was
# discarded and it raises KeyError whenever 'wtf' is not in the 10k vocabulary.
pos_hmean = term_freq_df2.pos_normcdf_hmean
y_val_predicted_proba = []
for t in x_validation:
    hmean_scores = [pos_hmean[w] for w in t.split() if w in pos_hmean.index]
    if len(hmean_scores) > 0:
        prob_score = np.mean(hmean_scores)
    else:
        # No vocabulary word present: fall back to a uniform random "probability".
        prob_score = np.random.random()
    y_val_predicted_proba.append(prob_score)
# 0.56 decision threshold chosen empirically on the validation scores.
pred = [1 if t > 0.56 else 0 for t in y_val_predicted_proba]
from sklearn.metrics import accuracy_score
accuracy_score(y_validation,pred)
The accuracy is not as good as logistic regression with the count vectorizer or the TFIDF vectorizer, but compared to the null accuracy it is 25.56% more accurate, and even compared to TextBlob sentiment analysis, my simple custom lexicon model is 15.31% more accurate. This is an impressive result for such a simple calculation, especially considering the fact that 'pos_normcdf_hmean' is calculated only from the training set. This might be useful later for an ensemble classifier.
import tweepy
# `hidden` is a local module holding the Twitter API credentials
# (consumer_key/secret, token_key/secret) — kept out of the notebook.
import hidden
import sqlite3
from langdetect import detect
# OAuth 1a: the consumer pair identifies the app, the token pair the user account.
auth = tweepy.OAuthHandler(hidden.consumer_key, hidden.consumer_secret)
auth.set_access_token(hidden.token_key, hidden.token_secret)
# Authenticated client used by every Twitter call below.
api = tweepy.API(auth)
def _lookup_place_id(city):
    """Return the Twitter place id of the first geo_search match for *city*.

    granularity="city" narrows the search to city-level places; the first
    result is assumed to be the intended city.
    """
    matches = api.geo_search(query=city, granularity="city")
    return matches[0].id

# Fix: the lookup code was triplicated; one helper replaces the three copies.
# The place ids scope the tweet searches in the following cells to each city.
place_id_L = _lookup_place_id("London")
print('London id is: ', place_id_L)
place_id_B = _lookup_place_id("Brighton")
print('Brighton id is: ', place_id_B)
place_id_E = _lookup_place_id("Edinburgh")
print('Edinburgh id is: ', place_id_E)
# How many tweets to pull per city (also reused by the persistence cell below,
# so it stays a module-level name).
maxitems = 10

def _preview_tweets(label, place_id):
    """Print the text of up to `maxitems` recent tweets from the given place id."""
    # Fixes: Py2-only `print` statements replaced by single-argument `print()`
    # calls (identical output on Python 2 and 3); the triplicated loop is now one
    # helper; the divider width is normalized (Edinburgh's was one dash shorter).
    print("{} tweets retrieve testing".format(label))
    print('----------------------------------')
    for tweet in tweepy.Cursor(api.search, q="place:%s" % place_id).items(maxitems):
        print(tweet.text)

_preview_tweets("London", place_id_L)
_preview_tweets("Brighton", place_id_B)
_preview_tweets("Edinburgh", place_id_E)
# Open (or create) the local SQLite store and ensure the tweets table exists.
# Fix: added IF NOT EXISTS — the bare CREATE TABLE raised sqlite3.OperationalError
# ("table ... already exists") whenever this notebook cell was re-run.
conn = sqlite3.connect('twitter_testing.sqlite')
cur = conn.cursor()
cur.executescript('''
CREATE TABLE IF NOT EXISTS Tweets_London (
    id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
    user_id TEXT,
    user_name TEXT,
    user_timezone TEXT,
    user_language TEXT,
    detected_language TEXT,
    tweet_text TEXT,
    tweet_created TEXT
)
''')
# Collect up to `maxitems` tweets geotagged to London and persist one row per
# tweet. INSERT OR IGNORE skips rows violating the UNIQUE id constraint.
for tweet in tweepy.Cursor(api.search, q="place:%s" % place_id_L).items(maxitems):
    # langdetect's guess of the tweet language. NOTE(review): detect() can raise
    # on empty or URL-only text — confirm whether that can occur here.
    detected = detect(tweet.text)
    cur.execute('''INSERT OR IGNORE INTO Tweets_London (
user_id, user_name, user_timezone, user_language, detected_language, tweet_text, tweet_created
)
VALUES ( ?,?,?,?,?,?,? )''', (tweet.user.id,tweet.user.screen_name,tweet.user.time_zone,tweet.user.lang,detected,tweet.text,tweet.created_at))
    # Commit per row: slower, but keeps partial progress if the stream fails mid-loop.
    conn.commit()
# Read the table back; the bare trailing expression displays it in the notebook.
from_sql = pd.read_sql_query("SELECT * FROM Tweets_London;", conn)
from_sql
## Saving the WorkSpace
# workspace('Save')
# NOTE(review): `workspace` is not defined in this chunk — the notebook head only
# defines WorkspaceBasedCheckPt. Confirm `workspace` is defined elsewhere.
workspace('Load')